suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Absolute project root; the trailing slash matters because the paths below
# are built with paste0().
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"
# NOTE(review): every path visible in this script is built from `wd`, so this
# setwd() looks redundant -- confirm nothing else relies on the working
# directory before removing it.
setwd(wd)

# Output directories for figures and tables (both end with a slash).
figdir <- paste0(wd, 'Figures/Espresso/Expression/')
tabledir <- paste0(wd, 'Tables/Espresso/')

Read the ESPRESSO DESeq2 results

# Add a `genetype2` column that sorts every row into a coarse RNA class.
#
# Reads the columns `gene_type`, `gene_name` and `seqname`. Rules are tried
# top to bottom and the first one that is TRUE wins (NA counts as no match):
#   protein_coding on chrM              -> 'mt-mRNA'
#   protein_coding elsewhere            -> 'mRNA'
#   chrM and gene_name has 'MT-RNR'     -> 'mt-rRNA'
#   chrM and gene_name has 'MT-T'       -> 'mt-tRNA'
#   gene_type is NA                     -> 'unannotated gene'
#   anything else                       -> 'other ncRNAs'
#
# Returns `df` with the extra column.
add_genetype2 <- function(df) {

  classify_gene <- function(gene_type, gene_name, seqname) {
    is_coding <- gene_type == 'protein_coding'
    is_mito <- seqname == 'chrM'

    case_when(
      is_coding & is_mito ~ 'mt-mRNA',
      is_coding & seqname != 'chrM' ~ 'mRNA',
      is_mito & grepl('MT-RNR', gene_name) ~ 'mt-rRNA',
      is_mito & grepl('MT-T', gene_name) ~ 'mt-tRNA',
      is.na(gene_type) ~ 'unannotated gene',
      .default = 'other ncRNAs'
    )
  }

  mutate(df, genetype2 = classify_gene(gene_type, gene_name, seqname))

}

# Flag differentially expressed transcripts for the two siRNAs (G and I).
#
# Reads `siMETTL2A_{G,I}_pvalue` and `siMETTL2A_{G,I}_log2FoldChange` and adds:
#   isUp        'common' (both siRNAs significant and both log2FC > 0),
#               'only G', 'only I', or 'not'
#   isDown      same categories for log2FC < 0
#   common_DETs 'up', 'down' or 'other', from the two 'common' calls
#
# A row that is significant in both siRNAs but in opposite directions is
# labelled 'only G' in one column and 'only I' in the other. A missing
# p-value or fold change never satisfies a rule, so such rows fall through to
# the next rule.
#
# NOTE(review): significance is taken from the raw `*_pvalue` columns, not
# `*_padj` -- confirm that is intended.
#
# @param df    Table with the four columns above.
# @param alpha P-value cutoff; the default .05 matches the previous
#              hard-coded threshold.
# @return An ungrouped tibble with the three extra columns.
add_isDET <- function(df, alpha = .05) {

  stopifnot(is.numeric(alpha), length(alpha) == 1L)

  # Vectorised replacement for the former rowwise() max()/min() calls.
  # pmax()/pmin() return NA when either input is NA, exactly like the scalar
  # versions did for a single row.
  call_direction <- function(p_G, p_I, lfc_G, lfc_I) {
    case_when(
      pmax(p_G, p_I) < alpha & pmin(lfc_G, lfc_I) > 0 ~ 'common',
      p_G < alpha & lfc_G > 0 ~ 'only G',
      p_I < alpha & lfc_I > 0 ~ 'only I',
      .default = 'not'
    )
  }

  df |>
    # rowwise() |> ungroup() used to drop any grouping and return a tibble;
    # keep that contract.
    ungroup() |>
    as_tibble() |>
    mutate(
      isUp = call_direction(
        siMETTL2A_G_pvalue, siMETTL2A_I_pvalue,
        siMETTL2A_G_log2FoldChange, siMETTL2A_I_log2FoldChange
      ),
      # "Down" is "up" on the sign-flipped fold changes:
      # max(a, b) < 0 is the same test as min(-a, -b) > 0.
      isDown = call_direction(
        siMETTL2A_G_pvalue, siMETTL2A_I_pvalue,
        -siMETTL2A_G_log2FoldChange, -siMETTL2A_I_log2FoldChange
      ),
      common_DETs = case_when(
        isUp   == 'common' ~ 'up',
        isDown == 'common' ~ 'down',
        .default = 'other'
      )
    )

}
 
# Count rows per group of `df` and express each count as a percentage of the
# total. Returns one row per group with `n` and `percentage`, largest share
# first.
calc_percentage <- function(df) {

  group_sizes <- reframe(df, n = n())
  with_share <- mutate(group_sizes, percentage = 100 * n / sum(n))

  arrange(with_share, desc(percentage))

}


# Load the DESeq2 result table and annotate it with the RNA class
# (`genetype2`) and the up/down calls (`isUp`, `isDown`, `common_DETs`).
espresso_deseq2_genetype2_isDET <-
  paste0(wd, 'Tables/Espresso/espresso_DESeq2_2024-03-30.tsv') |>
  read_tsv() |>
  add_genetype2() |>
  add_isDET()
## Rows: 36717 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (7): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write the annotated table into `tabledir`. The recorded output shows the
# file saved as `espresso_deseq2_genetype2_isDET_<date>.tsv`, i.e. named after
# the piped object.
# NOTE(review): export_tsv() is a myUtilities helper; presumably it derives
# the file name from the expression -- confirm before renaming the variable.
espresso_deseq2_genetype2_isDET |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv
## # A tibble: 36,717 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …

Expression level

# Distribution of mean expression (`siMETTL2A_baseMean`) on a log10 axis,
# with reference lines at 0.1 and 1.
#
# Transcripts with a baseMean of zero (or missing) cannot be shown on a log
# axis. They are dropped explicitly here instead of being removed by ggplot2
# with a warning. `bins = 30` spells out the default that ggplot2 was
# already using.
espresso_deseq2_genetype2_isDET |>
  filter(siMETTL2A_baseMean > 0) |>
  ggplot(aes(x = siMETTL2A_baseMean)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  geom_vline(xintercept = c(0.1, 1))
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 340 rows containing non-finite outside the scale range
## (`stat_bin()`).

Correlation between two siRNAs

# Compare the log2 fold changes of the two siRNAs (G on x, I on y). The
# recorded output shows that this call also prints Spearman and Pearson
# correlations.
# NOTE(review): plot_2dhistogram() is a myUtilities helper; `save_outdir`
# presumably makes it write the figure into `figdir`, and `axis_lim`
# presumably clips both axes to [-10, 10] -- confirm against the helper.
espresso_deseq2_genetype2_isDET |> 
  plot_2dhistogram(
    x = siMETTL2A_G_log2FoldChange, y = siMETTL2A_I_log2FoldChange, 
    save_outdir = figdir, base_size = 7, width = 6, height = 6, 
    axis_lim = c(-10, 10)
  )
## Warning in cor.test.default(x = mf[[1L]], y = mf[[2L]], ...): Cannot compute
## exact p-value with ties
## # A tibble: 2 × 9
##   estimate statistic p.value method  method_short alternative parameter conf.low
##      <dbl>     <dbl>   <dbl> <chr>   <chr>        <chr>           <int>    <dbl>
## 1    0.440   2.72e12       0 Spearm… Spearman     two.sided          NA   NA    
## 2    0.474   9.43e 1       0 Pearso… Pearson      two.sided       30784    0.465
## # ℹ 1 more variable: conf.high <dbl>
## # A tibble: 2 × 1
##   msg                            
##   <chr>                          
## 1 Spearman: r = 0.44, p < 2.2e-16
## 2 Pearson: r = 0.47, p < 2.2e-16
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin2d()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_bin2d()`).
## Removed 1 row containing non-finite outside the scale range (`stat_bin2d()`).
## Removed 1 row containing non-finite outside the scale range (`stat_bin2d()`).
## Removed 1 row containing non-finite outside the scale range (`stat_bin2d()`).

# Hex-binned version of the G vs I log2 fold change comparison, with a
# log10 colour scale for the bin counts.
# Both axes are limited to [-10, 10]; rows outside that range or with a
# missing fold change are dropped by the stat (the recorded output reports
# 5932 removed rows).
# NOTE(review): tune::coord_obs_pred() presumably forces identical x/y
# ranges with a 1:1 aspect ratio -- confirm.
espresso_deseq2_2dhistogram <- 
  espresso_deseq2_genetype2_isDET |> 
  ggplot(aes(x = siMETTL2A_G_log2FoldChange, y = siMETTL2A_I_log2FoldChange)) +
  geom_hex(bins = 50) +
  scale_fill_viridis_c(trans = 'log10') +
  lims(x = c(-10, 10), y = c(-10, 10)) +
  tune::coord_obs_pred(ratio = 1) 
# Save the figure into `figdir`.
# NOTE(review): ggsave_multiple_formats() is a myUtilities helper; presumably
# it names the files after the piped object -- confirm before renaming.
espresso_deseq2_2dhistogram |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )
## Warning: Removed 5932 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Warning: Removed 5932 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Removed 5932 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Removed 5932 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## Removed 5932 rows containing non-finite outside the scale range
## (`stat_binhex()`).

# Cross-tabulate the up and down calls: count and percentage of transcripts
# for every isUp x isDown combination.
calc_percentage(
  group_by(espresso_deseq2_genetype2_isDET, isUp, isDown)
)
## # A tibble: 9 × 4
##   isUp   isDown     n percentage
##   <chr>  <chr>  <int>      <dbl>
## 1 not    not    32016     87.2  
## 2 not    only I  1059      2.88 
## 3 only I not      872      2.37 
## 4 only G not      862      2.35 
## 5 not    only G   677      1.84 
## 6 not    common   540      1.47 
## 7 common not      438      1.19 
## 8 only G only I   159      0.433
## 9 only I only G    94      0.256
# Count and percentage of transcripts per up-regulation category.
calc_percentage(
  group_by(espresso_deseq2_genetype2_isDET, isUp)
)
## # A tibble: 4 × 3
##   isUp       n percentage
##   <chr>  <int>      <dbl>
## 1 not    34292      93.4 
## 2 only G  1021       2.78
## 3 only I   966       2.63
## 4 common   438       1.19
# Count and percentage of transcripts per down-regulation category.
calc_percentage(
  group_by(espresso_deseq2_genetype2_isDET, isDown)
)
## # A tibble: 4 × 3
##   isDown     n percentage
##   <chr>  <int>      <dbl>
## 1 not    34188      93.1 
## 2 only I  1218       3.32
## 3 only G   771       2.10
## 4 common   540       1.47
# Count and percentage of transcripts called up or down by both siRNAs
# versus everything else.
calc_percentage(
  group_by(espresso_deseq2_genetype2_isDET, common_DETs)
)
## # A tibble: 3 × 3
##   common_DETs     n percentage
##   <chr>       <int>      <dbl>
## 1 other       35739      97.3 
## 2 down          540       1.47
## 3 up            438       1.19

Number of transcripts per gene type among the DETs

# Transcripts up-regulated by both siRNAs (common_DETs == 'up').
espresso_deseq2_commonup <- 
  espresso_deseq2_genetype2_isDET |> 
  filter(common_DETs == 'up')
# Write the subset into `tabledir`; the recorded output shows the file saved
# as `espresso_deseq2_commonup_<date>.tsv`, matching the object name.
# NOTE(review): presumably export_tsv() derives the file name from the piped
# expression -- confirm before renaming the variable.
espresso_deseq2_commonup |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/Espresso/espresso_deseq2_commonup_2024-04-18.tsv
## # A tibble: 438 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000307839.… protein_coding  RPL15-201       ENSG00… protein_… RPL15    
##  2 ENST00000306627.8 protein_coding  UBE2E1-201      ENSG00… protein_… UBE2E1   
##  3 ENST00000438607.2 protein_coding  TMA7-201        ENSG00… protein_… TMA7     
##  4 ENST00000477624.1 retained_intron TMA7-202        ENSG00… protein_… TMA7     
##  5 ENST00000273258.4 protein_coding  ARL6IP5-201     ENSG00… protein_… ARL6IP5  
##  6 ENST00000355354.… protein_coding  CD47-201        ENSG00… protein_… CD47     
##  7 ENST00000264538.4 protein_coding  IFT57-201       ENSG00… protein_… IFT57    
##  8 ENST00000265062.8 protein_coding  RAB7A-201       ENSG00… protein_… RAB7A    
##  9 ENST00000451728.6 protein_coding  CNBP-204        ENSG00… protein_… CNBP     
## 10 ENST00000354910.… protein_coding  ANAPC13-201     ENSG00… protein_… ANAPC13  
## # ℹ 428 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Transcripts down-regulated by both siRNAs (common_DETs == 'down').
espresso_deseq2_commondown <- 
  espresso_deseq2_genetype2_isDET |> 
  filter(common_DETs == 'down')
# Write the subset into `tabledir`; the recorded output shows the file saved
# as `espresso_deseq2_commondown_<date>.tsv`, matching the object name.
# NOTE(review): presumably export_tsv() derives the file name from the piped
# expression -- confirm before renaming the variable.
espresso_deseq2_commondown |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/Espresso/espresso_deseq2_commondown_2024-04-18.tsv
## # A tibble: 540 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000301964.7 protein_coding  TADA3-201       ENSG00… protein_… TADA3    
##  2 ENST00000383817.5 protein_coding  CIDEC-202       ENSG00… protein_… CIDEC    
##  3 ENST00000344629.… protein_coding  OGG1-205        ENSG00… protein_… OGG1     
##  4 ENST00000306024.4 protein_coding  LSM3-201        ENSG00… protein_… LSM3     
##  5 ENST00000479563.5 retained_intron RPL14-208       ENSG00… protein_… RPL14    
##  6 ENST00000338970.… protein_coding  RPL14-201       ENSG00… protein_… RPL14    
##  7 ENST00000383729.9 protein_coding  P4HTM-202       ENSG00… protein_… P4HTM    
##  8 ENST00000326739.9 protein_coding  IMPDH2-201      ENSG00… protein_… IMPDH2   
##  9 ENST00000308388.7 protein_coding  GMPPB-202       ENSG00… protein_… GMPPB    
## 10 ENST00000417626.8 protein_coding  IFRD2-202       ENSG00… protein_… IFRD2    
## # ℹ 530 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Turn a `percentage` column into stacked [ymin, ymax] intervals on a 0-1
# scale, e.g. for drawing the segments of a donut chart.
#
# `ymax` is the running total of `percentage / 100` in row order, and `ymin`
# is the previous row's `ymax` (0 for the first row), so consecutive rows
# tile the range without gaps. The running total is taken over all rows in
# their current order; any grouping on `df` is ignored.
#
# @param df A data frame or tibble with a numeric `percentage` column.
# @return `df` with `ymax` and `ymin` added (or overwritten if they already
#   exist). A zero-row input returns a zero-row result instead of failing.
add_yrange <- function(df) {

  stopifnot('percentage' %in% names(df))

  upper <- cumsum(df[['percentage']] / 100)

  df$ymax <- upper
  # Shift by one row. Indexing with seq_along() keeps the result the same
  # length as the input, so an empty table yields an empty column rather
  # than a stray 0.
  df$ymin <- c(0, upper)[seq_along(upper)]

  df

}

# Draw a donut chart with one segment per `genetype2` value.
#
# `df` needs `genetype2` and `percentage`; the segment boundaries are
# computed here with add_yrange(). Each segment is a rectangle from x = 2 to
# x = 4 that is bent into a ring by the polar coordinate system, and its
# label is placed at x = 1 at the segment's mid-angle. `color_values` is
# passed to both the fill and the outline/label colour scales.
donutplot_genetype2 <- function(df, color_values) {

  ring_data <- add_yrange(df)

  ggplot(
    ring_data,
    aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = genetype2, colour = genetype2
    )
  ) +
    geom_rect() +
    ggrepel::geom_text_repel(
      aes(label = genetype2, y = (ymin + ymax) / 2), x = 1
    ) +
    coord_polar(theta = 'y') +
    xlim(-1, 4) +
    scale_colour_manual(values = color_values) +
    scale_fill_manual(values = color_values) +
    theme_void()

}

# RNA-class composition of the commonly up-regulated transcripts, with the
# stacked ymin/ymax ranges used by the donut chart.
espresso_deseq2_commonups_genetype_summary <- add_yrange(
  calc_percentage(group_by(espresso_deseq2_commonup, genetype2))
)
espresso_deseq2_commonups_genetype_summary
## # A tibble: 5 × 5
##   genetype2            n percentage  ymax  ymin
##   <chr>            <int>      <dbl> <dbl> <dbl>
## 1 mRNA               401     91.6   0.916 0    
## 2 other ncRNAs        16      3.65  0.952 0.916
## 3 mt-mRNA             11      2.51  0.977 0.952
## 4 unannotated gene     8      1.83  0.995 0.977
## 5 mt-rRNA              2      0.457 1     0.995
# RNA-class composition of the commonly down-regulated transcripts, with the
# stacked ymin/ymax ranges used by the donut chart.
espresso_deseq2_commondowns_genetype_summary <- add_yrange(
  calc_percentage(group_by(espresso_deseq2_commondown, genetype2))
)
espresso_deseq2_commondowns_genetype_summary
## # A tibble: 3 × 5
##   genetype2            n percentage  ymax  ymin
##   <chr>            <int>      <dbl> <dbl> <dbl>
## 1 mRNA               527     97.6   0.976 0    
## 2 other ncRNAs        11      2.04  0.996 0.976
## 3 unannotated gene     2      0.370 1     0.996
# Donut chart of the RNA classes among the commonly up-regulated transcripts.
# donutplot_genetype2() recomputes ymin/ymax itself, so the ranges already
# stored in the summary are simply overwritten.
# NOTE(review): the colours are unnamed, so ggplot2 matches them to the
# genetype2 values in scale order (usually alphabetical), not in row order;
# a named vector would make the pairing explicit.
espresso_deseq2_commonups_genetype_donutplot <- 
  espresso_deseq2_commonups_genetype_summary |> 
  donutplot_genetype2(
    color_values = c('#3e98f2', '#f2983e', '#f23e98', 'grey30', '#3ef298')
  )
# Save into `figdir`.
# NOTE(review): ggsave_multiple_formats() is a myUtilities helper; presumably
# it names the files after the piped object -- confirm before renaming.
espresso_deseq2_commonups_genetype_donutplot |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 5, height = 5, fontsize = 7
  )

# Donut chart of the RNA classes among the commonly down-regulated
# transcripts. Only three classes occur here, hence three colours.
# NOTE(review): the colours are unnamed, so ggplot2 matches them to the
# genetype2 values in scale order (usually alphabetical), not in row order;
# a named vector would make the pairing explicit.
espresso_deseq2_commondowns_genetype_donutplot <- 
  espresso_deseq2_commondowns_genetype_summary |> 
  donutplot_genetype2(
    color_values = c('#3e98f2', 'grey30', '#3ef298')
  )
# Save into `figdir`.
# NOTE(review): ggsave_multiple_formats() is a myUtilities helper; presumably
# it names the files after the piped object -- confirm before renaming.
espresso_deseq2_commondowns_genetype_donutplot |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 5, height = 5, fontsize = 7
  )

Percentage of DETs within each gene type

# For every RNA class, count the transcripts per common_DETs category and
# express each count as a percentage of that class (percentages sum to 100
# within each genetype2).
#
# `.by` scopes the grouping to the mutate() call, so the stored table is
# ungrouped and later verbs are not silently applied per genetype2.
espresso_deseq2_DETs_genetype_summary <-
  espresso_deseq2_genetype2_isDET |>
  count(genetype2, common_DETs) |>
  mutate(percent = 100 * n / sum(n), .by = genetype2)
espresso_deseq2_DETs_genetype_summary
## # A tibble: 13 × 4
## # Groups:   genetype2 [6]
##    genetype2        common_DETs     n percent
##    <chr>            <chr>       <int>   <dbl>
##  1 mRNA             down          527   1.69 
##  2 mRNA             other       30234  97.0  
##  3 mRNA             up            401   1.29 
##  4 mt-mRNA          other           2  15.4  
##  5 mt-mRNA          up             11  84.6  
##  6 mt-rRNA          up              2 100    
##  7 mt-tRNA          other           7 100    
##  8 other ncRNAs     down           11   0.238
##  9 other ncRNAs     other        4590  99.4  
## 10 other ncRNAs     up             16   0.347
## 11 unannotated gene down            2   0.218
## 12 unannotated gene other         906  98.9  
## 13 unannotated gene up              8   0.873
# Horizontal stacked bars: one bar per RNA class (order reversed), split by
# common_DETs category, with the percentage axis reversed.
espresso_deseq2_DETs_genetype_summary_barplot <-
  ggplot(
    espresso_deseq2_DETs_genetype_summary,
    aes(x = fct_rev(genetype2), y = percent, fill = common_DETs)
  ) +
  geom_col() +
  scale_fill_manual(values = c('#3e3ef2', 'grey', '#f23e3e')) +
  scale_y_reverse() +
  coord_flip()
espresso_deseq2_DETs_genetype_summary_barplot |> 
  ggsave_multiple_formats(outdir = figdir, width = 5, height = 4, fontsize = 7)

# For every common_DETs category, count the transcripts per RNA class and
# express each count as a percentage of that category (percentages sum to
# 100 within each common_DETs value).
#
# `.by` scopes the grouping to the mutate() call, so the stored table is
# ungrouped.
espresso_deseq2_DETs_genetype_summary2 <-
  espresso_deseq2_genetype2_isDET |>
  count(genetype2, common_DETs) |>
  mutate(percent = 100 * n / sum(n), .by = common_DETs)

# Horizontal stacked bars: one bar per common_DETs category (order
# reversed), split by RNA class, with the percentage axis reversed.
espresso_deseq2_DETs_genetype_summary_barplot2 <-
  ggplot(
    espresso_deseq2_DETs_genetype_summary2,
    aes(x = fct_rev(common_DETs), y = percent, fill = genetype2)
  ) +
  geom_col() +
  scale_fill_manual(
    values = c('#3e98f2', '#f2983e', '#f23e98', '#983ef2', 'grey30', '#3ef298')
  ) +
  scale_y_reverse() +
  coord_flip()
espresso_deseq2_DETs_genetype_summary_barplot2 |> 
  ggsave_multiple_formats(outdir = figdir, width = 5, height = 4, fontsize = 7)